suppressMessages(library(readxl))
## Warning: package 'readxl' was built under R version 3.6.3
suppressMessages(library(plotly))
## Warning: package 'plotly' was built under R version 3.6.3
suppressMessages(library(randomForest))
## Warning: package 'randomForest' was built under R version 3.6.3
suppressMessages(library(dplyr))
suppressMessages(library(GGally))
suppressMessages(library(ggplot2))
suppressMessages(library(scatterplot3d))

########################################################
## Clean Data
##
## Delete the data which height and weight = 'NA' (Invalid BMI data)
########################################################

dat_ex_all <- read_excel("C:/Users/wangy/Documents/Study/ST606/fit_database_anthropometric_all.xlsx", sheet=1, na='NA')

# Clean data (delete all data that the follow conditions are NA)
dat <- subset(dat_ex_all, `height (cm)` != 'NA' 
                & `weight (kg)` != 'NA')

dat$mYear <- substr(as.character(dat$`measurement date`), start=1, stop=4)

colnames(dat)[10] <- 'z-category'
dataSta <- data.frame(all = nrow(dat_ex_all), obj = nrow(dat), del = nrow(dat_ex_all) - nrow(dat)) 

Distribution of BMI

Different gender’s data is almost similar.

########################################################
## Distribution of BMI
##
## Different gender's data is almost similar.
########################################################
# BMI's distribution
p <- ggplot(data = dat) +
geom_histogram(mapping = aes(x = BMI), color="black", fill="lightblue")
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# BMI by gender
p+ facet_wrap(~gender)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = dat) + geom_density(mapping = aes(x = BMI))

########################################################
## z-category by gender
##
## Normal: boy < girl
## Overweight: boy < girl
## Obese: boy > girl
## thin: boy > girl
## severely thin: boy > girl
## 
## Normal > Overweight > Obese > thin >severely thin
########################################################

# ALL
dat$`z-category` = factor(dat$`z-category`, levels=c('normal','overweight','obese','thin','severely thin'))

ggplot(data = dat) + geom_bar(mapping = aes(x = `z-category`, fill=gender))

dat_boy <- subset(dat, gender == 'boy')
dat_girl <- subset(dat, gender == 'girl')
tab_a <- table(dat$`z-category`)
tab_b <- table(dat_boy$`z-category`)
tab_g <- table(dat_girl$`z-category`)

cbind('All' = tab_a, 'Boy' = tab_b, 'Girl' = tab_g, 'BoyVsGirl' = tab_b - tab_g)
##                 All   Boy  Girl BoyVsGirl
## normal        65345 31689 33656     -1967
## overweight    15708  7606  8102      -496
## obese         11379  6453  4926      1527
## thin           2051  1145   906       239
## severely thin   221   138    83        55
ggplot(data = dat_boy) +
geom_bar(mapping = aes(x = "", fill=`z-category`), position = "fill", width=1) + 
ggtitle("Z-category(Boy)") +
coord_polar("y")+
theme( axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank())

ggplot(data = dat_girl) +
geom_bar(mapping = aes(x = "", fill=`z-category`), position = "fill", width=1) + 
ggtitle("Z-category(Girl)") +
coord_polar("y")+
theme( axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank())

########################################################
## Z-category ~ Height + Weight + age + gender
## 
## cluster relationship
########################################################

ggplot(data = dat) + geom_point(mapping = aes(x = `weight (kg)`, y = `height (cm)`, color=`z-category`))

fig_boy <- plot_ly(dat_boy, x = ~`weight (kg)`, y = ~`height (cm)`, z = ~`age (years)`, color = ~`z-category`)
fig_boy <- fig_boy %>% add_markers()
fig_boy
fig_girl <- plot_ly(dat_girl, x = ~`weight (kg)`, y = ~`height (cm)`, z = ~`age (years)`, color = ~`z-category`)
fig_girl <- fig_girl %>% add_markers()
fig_girl
########################################################
## Predict Z-category ~ height + weight + age + gender (Machine Learning)
##  
## Sample 80% as training data
## Sample 20% as test data
## Use randomForest algorithm to learn the function
## then use the test data to predict the z-category and compare the actual result to chect the funtion
########################################################

dat_learn <- dat

## change gender to number (boy:1, girl:0)
dat_learn$gender[dat_learn$gender == "boy"] <- 1
dat_learn$gender[dat_learn$gender == "girl"] <- 0
dat_learn$gender <- as.numeric(dat_learn$gender)

## measurement date,age (years), age bin have strong Collinearity. So I only remain the age(years)
dat_learn <- cbind(dat_learn[, 3], dat_learn[, 5:7], dat_learn[, 10]) 
names(dat_learn)[1:4]<-c("age", "gender", "height", "weight")

## sampling 80% for training, 20% for test.
ind <- sample(c(1,2), nrow(dat_learn), replace=T, prob = c(0.8, 0.2))
## training data
trainData <- dat_learn[ind == 1,]
## test data
testData <- dat_learn[ind == 2,]

set.seed(500)
ez.forest <- randomForest(`z-category` ~., data = trainData,
                          na.action = na.roughfix,
                          importance = TRUE)
ez.forest
## 
## Call:
##  randomForest(formula = `z-category` ~ ., data = trainData, importance = TRUE,      na.action = na.roughfix) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 2%
## Confusion matrix:
##               normal overweight obese thin severely thin class.error
## normal         51918        270     0   61             0 0.006335049
## overweight       387      11946   184    0             0 0.045617960
## obese              0        256  8833    0             0 0.028165915
## thin             246          0     0 1405             4 0.151057402
## severely thin      1          0     0  102            65 0.613095238
importance(ez.forest,type=2)
##        MeanDecreaseGini
## age           6484.7467
## gender         827.4697
## height        8606.5229
## weight       19803.8608
forest.pred <- predict(ez.forest, testData)
forest.perf <- table(testData$`z-category`, forest.pred,
                     dnn = c('Actual','Predicted'))
forest.perf
##                Predicted
## Actual          normal overweight obese  thin severely thin
##   normal         12997         77     0    22             0
##   overweight       106       3023    62     0             0
##   obese              0         77  2213     0             0
##   thin              62          0     0   334             0
##   severely thin      1          0     0    27            25